Loading files

rm(list = ls())
graphics.off()

library(DepthProc)
Loading required package: ggplot2
Loading required package: Rcpp
Loading required package: rrcov
Loading required package: robustbase
Scalable Robust Estimators with High Breakdown Point (version 1.7-1)

Loading required package: MASS
Loading required package: np
Nonparametric Kernel Methods for Mixed Datatypes (version 0.60-14)
[vignette("np_faq",package="np") provides answers to frequently asked questions]
[vignette("np",package="np") an overview]
[vignette("entropy_np",package="np") an overview of entropy-based methods]

Attaching package: ‘DepthProc’

The following object is masked from ‘package:base’:

    as.matrix
library(aplpack)

load("~/Documents/thesis/features.Rdata")
load("~/Documents/thesis/lakes.Rdata")
pplot = function(data, x = NULL, n = NULL, col = c("gray"), xlab = "", ylab = "", main = "", mean = FALSE, lwd = c(1), alfa = c(1)){
  library(scales)
  if (length(n) == 0)
    n = dim(data)[1]
  if (length(alfa) == 0)
    alfa = rep(alfa, n)
  if (length(x)==0)
    x = seq(1,dim(data)[2],by=1)
  if (length(col)==1)
    col = rep(col, n)
  if (length(lwd)==1)
    lwd = rep(lwd, n)
  plot(x, data[1,], type = "l", lwd = lwd[1], col = alpha(col[1], alfa[1]), xlab = ylab, ylab = ylab, main = main, ylim = c(min(data), max(data)))
  for (i in 2:n){
    lines(x, data[i,], col = alpha(col[i], alfa[i]), lwd = lwd[i])
  }
  if (mean){
    par(new=TRUE)
    plot(x,apply(data, 2, mean), lwd = lwd[1], type = "l", col = "black", xlab = ylab, ylab = ylab, main = main, ylim = c(min(data), max(data)))
  }
}

f_kass = data.frame("spring.peak" = f_kass[1,], "avg.l1.der" = f_kass[2,], "avg.l1.mean" = f_kass[3,])
f_kort = data.frame("spring.peak" = f_kort[1,], "avg.l1.der" = f_kort[2,], "avg.l1.mean" = f_kort[3,])
f_naut = data.frame("spring.peak" = f_naut[1,], "avg.l1.der" = f_naut[2,], "avg.l1.mean" = f_naut[3,])

Because of the fact that the third variable has a much wider range than the other two, before starting the analysis, the entire datasets are rescaled in [0,1]:

for (j in 1:dim(f_kass)[2]){
  f_kass[,j] = (f_kass[,j] - min(f_kass[,j]))/(max(f_kass[,j])-min(f_kass[,j]))
  f_kort[,j] = (f_kort[,j] - min(f_kort[,j]))/(max(f_kort[,j])-min(f_kort[,j]))
  f_naut[,j] = (f_naut[,j] - min(f_naut[,j]))/(max(f_naut[,j])-min(f_naut[,j]))
}

The features are:

  1. spring.peak: \(max_{t\in[0,1]}f(t)\)
  2. avg.l1.der: \(\int_{0}^1 |f^\prime(t)|dt\)
  3. avg.l1.mean: \(\int_{0}^1 |f(t) - \mu_{lake}(t)|dt\)

Kass

d = f_kass
bagplot_ = bagplot.pairs(d)

the number of outliers is very low:

par(mfrow=c(1,4))
boxplot(d)
hist(d$spring.peak)
hist(d$avg.l1.der)
hist(d$avg.l1.mean)

We now perform PCA (on the rescaled dataset). The first PC explains the majority of the variability:

pc.data <- princomp(d, scores=T)
# summary(pc.data)
par(mfcol = c(1,3))
load.data <- pc.data$loadings
for(i in 1:3) barplot(load.data[,i], ylim = c(-1, 1), main=paste("PC",i))

The first PC is a mean of the 3 features, while the second one seems to highlight a contrast between the presence of a high spring peak and the overall “dissimilarity” of the curve wrt to the mean function.

layout(matrix(c(2,3,1,3),2,byrow=T))
plot(pc.data, las=2, main='Principal Components')
abline(h=1, col='blue')
barplot(sapply(as.data.frame(d),sd)^2, las=2, main='Original Variables', ylab='Variances')
plot(cumsum(pc.data$sde^2)/sum(pc.data$sde^2), type='b', axes=F, xlab='Number of components', ylab='Contribution to the total variance', ylim=c(0,1))
abline(h=1, col='blue')
abline(h=0.9, lty=2, col='blue')
box()
axis(2,at=0:10/10,labels=0:10/10)
par(mfrow = c(1,1))

For future analysis, we could simplify this dataset by only focusing on the first 2 PC (considering also the fact that the features are highly correlated):

plot(pc.data$scores[,1], pc.data$scores[,2], pch = 19, xlab = "PC1", ylab = "PC2")

It seems like most years have a low PC1 and a high PC2. Why is that?

plot(pc.data$scores[,1], pc.data$scores[,2], pch = 19)
pc1.weird = which(pc.data$scores[,1]>0.7)
pc2.weird = which(pc.data$scores[,2]< -0.3)
points(pc.data$scores[pc1.weird, 1],
       pc.data$scores[pc1.weird, 2], col = "red", pch = 19)
points(pc.data$scores[pc2.weird, 1],
       pc.data$scores[pc2.weird, 2], col = "green", pch = 19)

Lets plot the functions which have a high/low PC1:

lake = kass
d_lake = d_kass
d = f_kass

color = rep("gray", dim(lake)[1])
lwd = rep(1, dim(lake)[1])
alfa = rep(0.1, dim(lake)[1])
color[pc1.weird] = "red"
lwd[pc1.weird] = 2
alfa[pc1.weird] = 1
pplot(lake, col = color, main = "High PC1", lwd = lwd, alfa = alfa)

pc1.weird = which(pc.data$scores[,1]< -0.35)
color = rep("gray", dim(lake)[1])
lwd = rep(1, dim(lake)[1])
alfa = rep(0.1, dim(lake)[1])
color[pc1.weird] = "red"
lwd[pc1.weird] = 2
alfa[pc1.weird] = 1
pplot(lake, col = color, main = "Low PC1", lwd = lwd, alfa = alfa)

A high PC1 corresponds to functions whose oscillations throughout the year are particularly high (in absolute value), i.e. years in which the previous winter was particularly warm and there were many heavy rains in autumn. If the first PC is low, then the signals are almost constant (previous winter was warm and there were almost no rains). Lets plot the functions which have a low PC2:

color = rep("gray", dim(lake)[1])
lwd = rep(1, dim(lake)[1])
alfa = rep(0.3, dim(lake)[1])
color[pc2.weird] = "red"
lwd[pc2.weird] = 2
alfa[pc2.weird] = 1
pplot(lake, col = color, main = "Low PC2", lwd = lwd, alfa = alfa)

A low PC2 means that the first peak (spring peak) is actually quite low (cold winters), but the signals have quite high peaks during the winter (due possibly to heavy rains). The vast majority of the years, have a high PC2:

pc2.weird = which(pc.data$scores[,2]> 0.15)
color = rep("gray", dim(lake)[1])
lwd = rep(1, dim(lake)[1])
alfa = rep(0.1, dim(lake)[1])
color[pc2.weird] = "red"
lwd[pc2.weird] = 2
alfa[pc2.weird] = 1
pplot(lake, col = color, main = "High PC2", lwd = lwd, alfa = alfa)

Indeed, these years are very similar to the mean.

In short: - PC1 talks about peakS intensities: the higher, the more intense the peakS (also the ones in the end of the year); the lower, the more constant the overall signal. - PC2 talks about the autumn/winter of the current year: if low then the year has many autumn peaks, if high then the function is very similar to the mean, hence it has only one peak (which happens during the spring).

Kort

Performing the same analysis on the Kort dataset, the bagplots highlight the presence of many more ourliers (due to the fact that the distribution of the 3 features seems much more concentrated) and the correlation among spring.peak and avg.l1.der seems to be even more clear. As with Kass, the first 2 PC explain the vast majority of the variability and their interpretation is the same of Kass (PC1: average of the 3 features and PC2: contrast between spring peak and average displacement from the mean). In this case, the plot of the 2 first PC seems to be sufficient to embed the dataset:

lake = kort
d_lake = d_kort
d = f_kort
plot(princomp(f_kort, scores=T)$scores[,1], princomp(f_kort, scores=T)$scores[,2], pch = 19)

pc.data <- princomp(f_kort, scores=T)
par(mfrow = c(1,1))
plot(pc.data$scores[,1], pc.data$scores[,2], pch = 19)
pc1.weird = which(pc.data$scores[,1]>0.6)
pc2.weird = which(pc.data$scores[,2]< -0.15)
points(pc.data$scores[pc1.weird, 1],
       pc.data$scores[pc1.weird, 2], col = "red", pch = 19)
points(pc.data$scores[pc2.weird, 1],
       pc.data$scores[pc2.weird, 2], col = "green", pch = 19)

color = rep("gray", dim(lake)[1])
lwd = rep(1, dim(lake)[1])
alfa = rep(1.5, dim(lake)[1])
color[pc1.weird] = "red"
lwd[pc1.weird] = 1
alfa[pc1.weird] = 1
pplot(lake, col = color, main = "High PC1", lwd = lwd, alfa = alfa)

pc1.weird = which(pc.data$scores[,1]< - 0.15)
color = rep("gray", dim(lake)[1])
lwd = rep(1, dim(lake)[1])
alfa = rep(0.2, dim(lake)[1])
color[pc1.weird] = "red"
lwd[pc1.weird] = 1
alfa[pc1.weird] = 1
pplot(lake, col = color, main = "Low PC1", lwd = lwd, alfa = alfa)

PC1 has a similar interpretation when compared with Kass: if PC1 is low, then the funcitons are almost constant, while if high, they have quite high spring peaks. Unfortunately, the link between PC1 and the autumn peaks seems to be lost, indeed the role of the aveage abs of variability (first derivative) is not that relevant in Kort’s PC1 when compared with Kass:

par(mfcol = c(1,2))
barplot(princomp(f_kass, scores=T)$loadings[,1], ylim = c(-1, 1), main="PC1 - Kass")
barplot(princomp(f_kort, scores=T)$loadings[,1], ylim = c(-1, 1), main="PC1 - Kort")

About PC2:

pc2.weird = which(pc.data$scores[,2]> 0.1)
color = rep("gray", dim(lake)[1])
lwd = rep(1, dim(lake)[1])
alfa = rep(0.3, dim(lake)[1])
color[pc2.weird] = "red"
lwd[pc2.weird] = 2
alfa[pc2.weird] = 1
pplot(lake, col = color, main = "High PC2", lwd = lwd, alfa = alfa)

pc2.weird = which(pc.data$scores[,2]< -0.15)
color = rep("gray", dim(lake)[1])
lwd = rep(1, dim(lake)[1])
alfa = rep(0.3, dim(lake)[1])
color[pc2.weird] = "red"
lwd[pc2.weird] = 2
alfa[pc2.weird] = 1
pplot(lake, col = color, main = "Low PC2", lwd = lwd, alfa = alfa)

High PC2: then the curve has mainly one peak which happens during spring (some exeptions present prominent peaks even during autumn and winter). Low PC2: much more difficult to interpret, but rememer that the second PC explains much less variability than PC1…

Naut

The fact that the data collection process of Kort and Naut is different from the one of Kass may be the reason for the similarities among the 2 finnish lakes when compared with the swedish one. Even the scatterplot of PC1 and PC2 are similar:

lake = naut
d_lake = d_naut
d = f_naut

pc.data <- princomp(f_naut, scores=T)
par(mfrow = c(1,1))
plot(princomp(f_naut, scores=T)$scores[,1], princomp(f_naut, scores=T)$scores[,2], pch = 19)

The barplot of the loadings of the PCA for naut is very similar to Kass. Let’s look at PC1:

pc1.weird = which(pc.data$scores[,1]> 0.5)
color = rep("gray", dim(lake)[1])
lwd = rep(1, dim(lake)[1])
alfa = rep(0.2, dim(lake)[1])
color[pc1.weird] = "red"
lwd[pc1.weird] = 1
alfa[pc1.weird] = 1
pplot(lake, col = color, main = "High PC1", lwd = lwd, alfa = alfa)

pc1.weird = which(pc.data$scores[,1]< -0.19)
color = rep("gray", dim(lake)[1])
lwd = rep(1, dim(lake)[1])
alfa = rep(0.1, dim(lake)[1])
color[pc1.weird] = "red"
lwd[pc1.weird] = 2
alfa[pc1.weird] = 1
pplot(lake, col = color, main = "Low PC1", lwd = lwd, alfa = alfa)

PC1 has the exact same interpretation as lake Kass, contrary to Kort… Let’s look at PC2:

pc2.weird = which(pc.data$scores[,2]< -0.15)
color = rep("gray", dim(lake)[1])
lwd = rep(1, dim(lake)[1])
alfa = rep(0.3, dim(lake)[1])
color[pc2.weird] = "red"
lwd[pc2.weird] = 2
alfa[pc2.weird] = 1
pplot(lake, col = color, main = "Low PC2", lwd = lwd, alfa = alfa)

pc2.weird = which(pc.data$scores[,2]> 0.09)
color = rep("gray", dim(lake)[1])
lwd = rep(1, dim(lake)[1])
alfa = rep(0.3, dim(lake)[1])
color[pc2.weird] = "red"
lwd[pc2.weird] = 2
alfa[pc2.weird] = 1
pplot(lake, col = color, main = "High PC2", lwd = lwd, alfa = alfa)

PC2 has a similar interpretation to PC2 of Kass, but again it is less evident.

Overall: * PC1: peaks intensity (the lower, the more constant; the higher, the more evident the peaks are) * PC2: (the higher, the more a function resembles the mean; the lower, the more shifted the spring peak is)

CONSIDERING THAT KORT AND NAUT ARE MUCH MORE “WIGGLY” THAN KASS AND THAT THE SECOND PC EXPLINS MUCH LESS VARIABILITY THAN THE FIRST, WE STICK WITH THE INTERPRETATION OF PC1 AND PCS BASED ON KASS. MOREOVER, PC2 OF KASS EXPLAINS MORE VARIABILITY THAN PC2 OF KORT AND NAUT:

par(mfrow=c(1,1))
plot(cumsum(princomp(f_kass, scores=T)$sde^2)/sum(princomp(f_kass, scores=T)$sde^2), type='b', axes=F, xlab='Number of components', col = "black", ylab='Contribution to the total variance', ylim=c(0,1))
points(cumsum(princomp(f_kort, scores=T)$sde^2)/sum(princomp(f_kort, scores=T)$sde^2), type='b', xlab='Number of components', col = "blue")
points(cumsum(princomp(f_naut, scores=T)$sde^2)/sum(princomp(f_naut, scores=T)$sde^2), type='b', xlab='Number of components', col = "red")
axis(2,at=0:10/10,labels=0:10/10)
legend(1,0.5, legend=c("kass", "kort", "naut"), col = c("black", "blue", "red"), lty = 1)
box()

HC

lake = kass
d_lake = d_kass
d = f_kass
pc.data <- princomp(d, scores=T)
dd = data.frame("PC1"=pc.data$scores[,1],"PC2"=pc.data$scores[,2])
d <- dist(dd, method = 'euclidean')
hc <- hclust(d, method = "average")
plot(hc, labels=FALSE, xlab="", sub="")
rect.hclust(hc, k = 4, border = "red")
rect.hclust(hc, k = 5, border = "green")
rect.hclust(hc, k = 6, border = "blue")

# table(cutree(hc, k = 3))

By applying hclust, we see that the results are not very satisfying, since the goal would be to cluster climate types and already with k = 3 we see that one cluster is quite scarce (less than 60 years belong to it). Moreover the clustering is completely driven by the first PC:

par(mfrow = c(2,2))
plot(dd, col = cutree(hc, k = 3), main = "k = 3")
plot(dd, col = cutree(hc, k = 4), main = "k = 4")
plot(dd, col = cutree(hc, k = 5), main = "k = 5")
plot(dd, col = cutree(hc, k = 6), main = "k = 6")
par(mfrow = c(1,1))

Exactly the same results hold for the other 2 finnish lakes.

ConfromalClustering

Very bad

K-means

[The following results are applied to Kass, but they can be generalized to the finnish lakes] By applying k-means with k = 3, we see a clar improvement when compared to HC (with 3 groups):

lake = kass
d_lake = d_kass
d = f_kass
pc.data <- princomp(d, scores=T)
dd = data.frame("PC1"=pc.data$scores[,1],"PC2"=pc.data$scores[,2])
km = kmeans(dd, centers = 3, nstart = 20)
table(km$cluster)

   1    2    3 
2209 2060 1002 

Still, PC1 seems to be the most relevant component:

par(mfrow = c(3,2))
plot(dd, col = kmeans(dd, centers = 3, nstart = 20)$cluster, main = "k = 3")
plot(dd, col = kmeans(dd, centers = 4, nstart = 20)$cluster, main = "k = 4")
plot(dd, col = kmeans(dd, centers = 5, nstart = 20)$cluster, main = "k = 5")
plot(dd, col = kmeans(dd, centers = 6, nstart = 20)$cluster, main = "k = 6")
plot(dd, col = kmeans(dd, centers = 7, nstart = 20)$cluster, main = "k = 7")
plot(dd, col = kmeans(dd, centers = 8, nstart = 20)$cluster, main = "k = 8")
par(mfrow = c(1,1))

If we tried to include the third PC in the analysis, we would obtain slighlty different results:

ddd = dd
ddd$PC3 = pc.data$scores[,3]
par(mfrow = c(3,2))
plot(ddd, col = kmeans(ddd, centers = 3, nstart = 20)$cluster, main = "k = 3")

plot(ddd, col = kmeans(ddd, centers = 4, nstart = 20)$cluster, main = "k = 4")

plot(ddd, col = kmeans(ddd, centers = 5, nstart = 20)$cluster, main = "k = 5")

plot(ddd, col = kmeans(ddd, centers = 6, nstart = 20)$cluster, main = "k = 6")

plot(ddd, col = kmeans(ddd, centers = 7, nstart = 20)$cluster, main = "k = 7")

plot(ddd, col = kmeans(ddd, centers = 8, nstart = 20)$cluster, main = "k = 8")
par(mfrow = c(1,1))

But still PC1 is the driving factor and, moreover, by considering 3 PC we are not really performing dimensionality reduction. Moreover, the 3rd PC always explains a very low amount of variability:

summary(pc.data)
Importance of components:
                          Comp.1    Comp.2     Comp.3
Standard deviation     0.2421391 0.0934147 0.05203741
Proportion of Variance 0.8368071 0.1245449 0.03864798
Cumulative Proportion  0.8368071 0.9613520 1.00000000

For this reason, we decide not to include PC3 in the analysis. Finally, clearly Kmeans is more satisfying than hclust unless k is small (roughly < 6).

BaggingVoronoi:

K = 3, L = 100, B = 500 (2 minutes)

K = 6, L = 10, B = 500 (4 minutes)

---
title: "Mutlivariate analysis of lake Kassjön, Korttajärvi and Nautajärvi"
output: html_notebook
---

![](/Users/disa/Desktop/Screenshot3.png)

# Loading files
```{r}
rm(list = ls())
graphics.off()

library(DepthProc)
library(aplpack)

load("~/Documents/thesis/features.Rdata")
load("~/Documents/thesis/lakes.Rdata")
pplot = function(data, x = NULL, n = NULL, col = c("gray"), xlab = "", ylab = "", main = "", mean = FALSE, lwd = c(1), alfa = c(1)){
  library(scales)
  if (length(n) == 0)
    n = dim(data)[1]
  if (length(alfa) == 0)
    alfa = rep(alfa, n)
  if (length(x)==0)
    x = seq(1,dim(data)[2],by=1)
  if (length(col)==1)
    col = rep(col, n)
  if (length(lwd)==1)
    lwd = rep(lwd, n)
  plot(x, data[1,], type = "l", lwd = lwd[1], col = alpha(col[1], alfa[1]), xlab = ylab, ylab = ylab, main = main, ylim = c(min(data), max(data)))
  for (i in 2:n){
    lines(x, data[i,], col = alpha(col[i], alfa[i]), lwd = lwd[i])
  }
  if (mean){
    par(new=TRUE)
    plot(x,apply(data, 2, mean), lwd = lwd[1], type = "l", col = "black", xlab = ylab, ylab = ylab, main = main, ylim = c(min(data), max(data)))
  }
}

f_kass = data.frame("spring.peak" = f_kass[1,], "avg.l1.der" = f_kass[2,], "avg.l1.mean" = f_kass[3,])
f_kort = data.frame("spring.peak" = f_kort[1,], "avg.l1.der" = f_kort[2,], "avg.l1.mean" = f_kort[3,])
f_naut = data.frame("spring.peak" = f_naut[1,], "avg.l1.der" = f_naut[2,], "avg.l1.mean" = f_naut[3,])

```
Because of the fact that the third variable has a much wider range than the other two, before starting the analysis, the entire datasets are rescaled in [0,1]:
```{r}
for (j in 1:dim(f_kass)[2]){
  f_kass[,j] = (f_kass[,j] - min(f_kass[,j]))/(max(f_kass[,j])-min(f_kass[,j]))
  f_kort[,j] = (f_kort[,j] - min(f_kort[,j]))/(max(f_kort[,j])-min(f_kort[,j]))
  f_naut[,j] = (f_naut[,j] - min(f_naut[,j]))/(max(f_naut[,j])-min(f_naut[,j]))
}
```
The features are:

1. spring.peak: $max_{t\in[0,1]}f(t)$
2. avg.l1.der: $\int_{0}^1 |f^\prime(t)|dt$
3. avg.l1.mean: $\int_{0}^1 |f(t) - \mu_{lake}(t)|dt$

## Kass
```{r}
d = f_kass
bagplot_ = bagplot.pairs(d)
```
the number of outliers is very low:
```{r}
par(mfrow=c(1,4))
boxplot(d)
hist(d$spring.peak)
hist(d$avg.l1.der)
hist(d$avg.l1.mean)
```
We now perform PCA (on the rescaled dataset). The first PC explains the majority of the variability:
```{r}
pc.data <- princomp(d, scores=T)
# summary(pc.data)
par(mfcol = c(1,3))
load.data <- pc.data$loadings
for(i in 1:3) barplot(load.data[,i], ylim = c(-1, 1), main=paste("PC",i))
```
The first PC is a mean of the 3 features, while the second one seems to highlight a contrast between the presence of a high spring peak and the overall "dissimilarity" of the curve wrt to the mean function. 
```{r}
layout(matrix(c(2,3,1,3),2,byrow=T))
plot(pc.data, las=2, main='Principal Components')
abline(h=1, col='blue')
barplot(sapply(as.data.frame(d),sd)^2, las=2, main='Original Variables', ylab='Variances')
plot(cumsum(pc.data$sde^2)/sum(pc.data$sde^2), type='b', axes=F, xlab='Number of components', ylab='Contribution to the total variance', ylim=c(0,1))
abline(h=1, col='blue')
abline(h=0.9, lty=2, col='blue')
box()
axis(2,at=0:10/10,labels=0:10/10)
par(mfrow = c(1,1))
```
For future analysis, we could simplify this dataset by only focusing on the first 2 PC (considering also the fact that the features are highly correlated):
```{r}
plot(pc.data$scores[,1], pc.data$scores[,2], pch = 19, xlab = "PC1", ylab = "PC2")
```
It seems like most years have a low PC1 and a high PC2. Why is that?
```{r}
plot(pc.data$scores[,1], pc.data$scores[,2], pch = 19)
pc1.weird = which(pc.data$scores[,1]>0.7)
pc2.weird = which(pc.data$scores[,2]< -0.3)
points(pc.data$scores[pc1.weird, 1],
       pc.data$scores[pc1.weird, 2], col = "red", pch = 19)
points(pc.data$scores[pc2.weird, 1],
       pc.data$scores[pc2.weird, 2], col = "green", pch = 19)
```
Lets plot the functions which have a high/low PC1:
```{r}
lake = kass
d_lake = d_kass
d = f_kass

color = rep("gray", dim(lake)[1])
lwd = rep(1, dim(lake)[1])
alfa = rep(0.1, dim(lake)[1])
color[pc1.weird] = "red"
lwd[pc1.weird] = 2
alfa[pc1.weird] = 1
pplot(lake, col = color, main = "High PC1", lwd = lwd, alfa = alfa)
```
```{r}
pc1.weird = which(pc.data$scores[,1]< -0.35)
color = rep("gray", dim(lake)[1])
lwd = rep(1, dim(lake)[1])
alfa = rep(0.1, dim(lake)[1])
color[pc1.weird] = "red"
lwd[pc1.weird] = 2
alfa[pc1.weird] = 1
pplot(lake, col = color, main = "Low PC1", lwd = lwd, alfa = alfa)
```
A high PC1 corresponds to functions whose oscillations throughout the year are particularly high (in absolute value), i.e. years in which the previous winter was particularly warm and there were many heavy rains in autumn. If the first PC is low, then the signals are almost constant (previous winter was warm and there were almost no rains).
Lets plot the functions which have a low PC2:
```{r}
color = rep("gray", dim(lake)[1])
lwd = rep(1, dim(lake)[1])
alfa = rep(0.3, dim(lake)[1])
color[pc2.weird] = "red"
lwd[pc2.weird] = 2
alfa[pc2.weird] = 1
pplot(lake, col = color, main = "Low PC2", lwd = lwd, alfa = alfa)
```
A low PC2 means that the first peak (spring peak) is actually quite low (cold winters), but the signals have quite high peaks during the winter (due possibly to heavy rains). The vast majority of the years, have a high PC2:
```{r}
pc2.weird = which(pc.data$scores[,2]> 0.15)
color = rep("gray", dim(lake)[1])
lwd = rep(1, dim(lake)[1])
alfa = rep(0.1, dim(lake)[1])
color[pc2.weird] = "red"
lwd[pc2.weird] = 2
alfa[pc2.weird] = 1
pplot(lake, col = color, main = "High PC2", lwd = lwd, alfa = alfa)
```
Indeed, these years are very similar to the mean.

In short: 
- PC1 talks about peakS intensities: the higher, the more intense the peakS (also the ones in the end of the year); the lower, the more constant the overall signal.
- PC2 talks about the autumn/winter of the current year: if low then the year has many autumn peaks, if high then the function is very similar to the mean, hence it has only one peak (which happens during the spring).

## Kort
Performing the same analysis on the Kort dataset, the bagplots highlight the presence of many more ourliers (due to the fact that the distribution of the 3 features seems much more concentrated) and the correlation among spring.peak and avg.l1.der seems to be even more clear.
As with Kass, the first 2 PC explain the vast majority of the variability and their interpretation is the same of Kass (PC1: average of the 3 features and PC2: contrast between spring peak and average displacement from the mean). In this case, the plot of the 2 first PC seems to be sufficient to embed the dataset:
```{r}
lake = kort
d_lake = d_kort
d = f_kort
plot(princomp(f_kort, scores=T)$scores[,1], princomp(f_kort, scores=T)$scores[,2], pch = 19)
```
```{r}
pc.data <- princomp(f_kort, scores=T)
par(mfrow = c(1,1))
plot(pc.data$scores[,1], pc.data$scores[,2], pch = 19)
pc1.weird = which(pc.data$scores[,1]>0.6)
pc2.weird = which(pc.data$scores[,2]< -0.15)
points(pc.data$scores[pc1.weird, 1],
       pc.data$scores[pc1.weird, 2], col = "red", pch = 19)
points(pc.data$scores[pc2.weird, 1],
       pc.data$scores[pc2.weird, 2], col = "green", pch = 19)
```

```{r}
color = rep("gray", dim(lake)[1])
lwd = rep(1, dim(lake)[1])
alfa = rep(1.5, dim(lake)[1])
color[pc1.weird] = "red"
lwd[pc1.weird] = 1
alfa[pc1.weird] = 1
pplot(lake, col = color, main = "High PC1", lwd = lwd, alfa = alfa)
```
```{r}
pc1.weird = which(pc.data$scores[,1]< - 0.15)
color = rep("gray", dim(lake)[1])
lwd = rep(1, dim(lake)[1])
alfa = rep(0.2, dim(lake)[1])
color[pc1.weird] = "red"
lwd[pc1.weird] = 1
alfa[pc1.weird] = 1
pplot(lake, col = color, main = "Low PC1", lwd = lwd, alfa = alfa)
```
PC1 has a similar interpretation when compared with Kass: if PC1 is low, then the funcitons are almost constant, while if high, they have quite high spring peaks. Unfortunately, the link between PC1 and the autumn peaks seems to be lost, indeed the role of the aveage abs of variability (first derivative) is not that relevant in Kort's PC1 when compared with Kass:
```{r}
par(mfcol = c(1,2))
barplot(princomp(f_kass, scores=T)$loadings[,1], ylim = c(-1, 1), main="PC1 - Kass")
barplot(princomp(f_kort, scores=T)$loadings[,1], ylim = c(-1, 1), main="PC1 - Kort")
```
About PC2:
```{r}
pc2.weird = which(pc.data$scores[,2]> 0.1)
color = rep("gray", dim(lake)[1])
lwd = rep(1, dim(lake)[1])
alfa = rep(0.3, dim(lake)[1])
color[pc2.weird] = "red"
lwd[pc2.weird] = 2
alfa[pc2.weird] = 1
pplot(lake, col = color, main = "High PC2", lwd = lwd, alfa = alfa)
```
```{r}
pc2.weird = which(pc.data$scores[,2]< -0.15)
color = rep("gray", dim(lake)[1])
lwd = rep(1, dim(lake)[1])
alfa = rep(0.3, dim(lake)[1])
color[pc2.weird] = "red"
lwd[pc2.weird] = 2
alfa[pc2.weird] = 1
pplot(lake, col = color, main = "Low PC2", lwd = lwd, alfa = alfa)
```
High PC2: then the curve has mainly one peak which happens during spring (some exeptions present prominent peaks even during autumn and winter).
Low PC2: much more difficult to interpret, but rememer that the second PC explains much less variability than PC1...

## Naut
The fact that the data collection process of Kort and Naut is different from the one of Kass may be the reason for the similarities among the 2 finnish lakes when compared with the swedish one. Even the scatterplot of PC1 and PC2 are similar: 
```{r}
lake = naut
d_lake = d_naut
d = f_naut

pc.data <- princomp(f_naut, scores=T)
par(mfrow = c(1,1))
plot(princomp(f_naut, scores=T)$scores[,1], princomp(f_naut, scores=T)$scores[,2], pch = 19)
```
The barplot of the loadings of the PCA for naut is very similar to Kass.
Let's look at PC1:
```{r}
pc1.weird = which(pc.data$scores[,1]> 0.5)
color = rep("gray", dim(lake)[1])
lwd = rep(1, dim(lake)[1])
alfa = rep(0.2, dim(lake)[1])
color[pc1.weird] = "red"
lwd[pc1.weird] = 1
alfa[pc1.weird] = 1
pplot(lake, col = color, main = "High PC1", lwd = lwd, alfa = alfa)
```
```{r}
pc1.weird = which(pc.data$scores[,1]< -0.19)
color = rep("gray", dim(lake)[1])
lwd = rep(1, dim(lake)[1])
alfa = rep(0.1, dim(lake)[1])
color[pc1.weird] = "red"
lwd[pc1.weird] = 2
alfa[pc1.weird] = 1
pplot(lake, col = color, main = "Low PC1", lwd = lwd, alfa = alfa)
```
PC1 has the exact same interpretation as lake Kass, contrary to Kort...
Let's look at PC2:
```{r}
pc2.weird = which(pc.data$scores[,2]< -0.15)
color = rep("gray", dim(lake)[1])
lwd = rep(1, dim(lake)[1])
alfa = rep(0.3, dim(lake)[1])
color[pc2.weird] = "red"
lwd[pc2.weird] = 2
alfa[pc2.weird] = 1
pplot(lake, col = color, main = "Low PC2", lwd = lwd, alfa = alfa)
```
```{r}
pc2.weird = which(pc.data$scores[,2]> 0.09)
color = rep("gray", dim(lake)[1])
lwd = rep(1, dim(lake)[1])
alfa = rep(0.3, dim(lake)[1])
color[pc2.weird] = "red"
lwd[pc2.weird] = 2
alfa[pc2.weird] = 1
pplot(lake, col = color, main = "High PC2", lwd = lwd, alfa = alfa)
```
PC2 has a similar interpretation to PC2 of Kass, but again it is less evident.

Overall:
* PC1: peaks intensity (the lower, the more constant; the higher, the more evident the peaks are)
* PC2: (the higher, the more a function resembles the mean; the lower, the more shifted the spring peak is)

CONSIDERING THAT KORT AND NAUT ARE MUCH MORE "WIGGLY" THAN KASS AND THAT THE SECOND PC EXPLINS MUCH LESS VARIABILITY THAN THE FIRST, WE STICK WITH THE INTERPRETATION OF PC1 AND PCS BASED ON KASS. MOREOVER, PC2 OF KASS EXPLAINS MORE VARIABILITY THAN PC2 OF KORT AND NAUT:
```{r}
par(mfrow=c(1,1))
plot(cumsum(princomp(f_kass, scores=T)$sde^2)/sum(princomp(f_kass, scores=T)$sde^2), type='b', axes=F, xlab='Number of components', col = "black", ylab='Contribution to the total variance', ylim=c(0,1))
points(cumsum(princomp(f_kort, scores=T)$sde^2)/sum(princomp(f_kort, scores=T)$sde^2), type='b', xlab='Number of components', col = "blue")
points(cumsum(princomp(f_naut, scores=T)$sde^2)/sum(princomp(f_naut, scores=T)$sde^2), type='b', xlab='Number of components', col = "red")
axis(2,at=0:10/10,labels=0:10/10)
legend(1,0.5, legend=c("kass", "kort", "naut"), col = c("black", "blue", "red"), lty = 1)
box()
```



# HC
```{r}
lake = kass
d_lake = d_kass
d = f_kass
pc.data <- princomp(d, scores=T)
dd = data.frame("PC1"=pc.data$scores[,1],"PC2"=pc.data$scores[,2])
d <- dist(dd, method = 'euclidean')
hc <- hclust(d, method = "average")
plot(hc, labels=FALSE, xlab="", sub="")
rect.hclust(hc, k = 4, border = "red")
rect.hclust(hc, k = 5, border = "green")
rect.hclust(hc, k = 6, border = "blue")
# table(cutree(hc, k = 3))
```
By applying hclust, we see that the results are not very satisfying, since the goal would be to cluster climate types and already with k = 3 we see that one cluster is quite scarce (less than 60 years belong to it). Moreover the clustering is completely driven by the first PC:
```{r}
par(mfrow = c(2,2))
plot(dd, col = cutree(hc, k = 3), main = "k = 3")
plot(dd, col = cutree(hc, k = 4), main = "k = 4")
plot(dd, col = cutree(hc, k = 5), main = "k = 5")
plot(dd, col = cutree(hc, k = 6), main = "k = 6")
par(mfrow = c(1,1))
```
Exactly the same results hold for the other 2 finnish lakes. 

# ConfromalClustering 
Very bad

# K-means
[The following results are applied to Kass, but they can be generalized to the finnish lakes]
By applying k-means with k = 3, we see a clar improvement when compared to HC (with 3 groups):

```{r}
lake = kass
d_lake = d_kass
d = f_kass
pc.data <- princomp(d, scores=T)
dd = data.frame("PC1"=pc.data$scores[,1],"PC2"=pc.data$scores[,2])
km = kmeans(dd, centers = 3, nstart = 20)
table(km$cluster)
```
Still, PC1 seems to be the most relevant component:
```{r}
par(mfrow = c(3,2))
plot(dd, col = kmeans(dd, centers = 3, nstart = 20)$cluster, main = "k = 3")
plot(dd, col = kmeans(dd, centers = 4, nstart = 20)$cluster, main = "k = 4")
plot(dd, col = kmeans(dd, centers = 5, nstart = 20)$cluster, main = "k = 5")
plot(dd, col = kmeans(dd, centers = 6, nstart = 20)$cluster, main = "k = 6")
plot(dd, col = kmeans(dd, centers = 7, nstart = 20)$cluster, main = "k = 7")
plot(dd, col = kmeans(dd, centers = 8, nstart = 20)$cluster, main = "k = 8")
par(mfrow = c(1,1))
```
If we tried to include the third PC in the analysis, we would obtain slighlty different results:
```{r}
ddd = dd
ddd$PC3 = pc.data$scores[,3]
par(mfrow = c(3,2))
plot(ddd, col = kmeans(ddd, centers = 3, nstart = 20)$cluster, main = "k = 3")
plot(ddd, col = kmeans(ddd, centers = 4, nstart = 20)$cluster, main = "k = 4")
plot(ddd, col = kmeans(ddd, centers = 5, nstart = 20)$cluster, main = "k = 5")
plot(ddd, col = kmeans(ddd, centers = 6, nstart = 20)$cluster, main = "k = 6")
plot(ddd, col = kmeans(ddd, centers = 7, nstart = 20)$cluster, main = "k = 7")
plot(ddd, col = kmeans(ddd, centers = 8, nstart = 20)$cluster, main = "k = 8")
par(mfrow = c(1,1))
```
But still PC1 is the driving factor and, moreover, by considering 3 PC we are not really performing dimensionality reduction. Moreover, the 3rd PC always explains a very low amount of variability:
```{r}
summary(pc.data)
```
For this reason, we decide not to include PC3 in the analysis.
Finally, clearly Kmeans is more satisfying than hclust unless k is small (roughly < 6).

# BaggingVoronoi:
K = 3, L = 100, B = 500 (2 minutes)
![](/Users/disa/Desktop/Screenshot.png)

K = 6, L = 10, B = 500 (4 minutes)
![](/Users/disa/Desktop/Screenshot2.png)

